MegaMUGA Annotations

# Reading in "control genotype" data
megamuga_controls <- data.table::fread("../data/MMControls/MegaMUGA genotypes CONTROLS.csv")
control_genotypes <- data.table::fread("../data/MMControls/control.genotypes.csv")
## Warning in data.table::fread("../data/MMControls/control.genotypes.csv"):
## Detected 364 column names but the data has 365 columns (i.e. invalid file).
## Added 1 extra default column name for the first column which is guessed to be
## row names or an index. Use setnames() afterwards if this guess is not correct,
## or fix the file write command that created the file to create a valid file.
colnames(control_genotypes)[1] <- "marker"

## Reading in marker annotations
mm_metadata <- data.table::fread("../data/MMControls/mm_uwisc_v2.csv")

## Calculating allele frequencies for each marker
control_allele_freqs <- control_genotypes %>%
  tidyr::pivot_longer(-marker, names_to = "sample", values_to = "genotype") %>%
  dplyr::group_by(marker, genotype) %>%
  dplyr::summarise(n = n()) %>%
  dplyr::mutate(freq = round(n/sum(n), 3),
                genotype = as.factor(genotype)) %>%
  dplyr::left_join(., mm_metadata)

First, we searched for probes where many mice are missing genotype calls.

## Filtering to markers with missing genotypes
no.calls <- control_allele_freqs %>%
  dplyr::filter(genotype == "N") %>%
  tidyr::pivot_wider(names_from = genotype, values_from = n) %>%
  dplyr::select(marker, chr, bp_grcm39, freq) %>%
  dplyr::mutate(chr = as.factor(chr))
cutoff <- quantile(no.calls$freq, probs = seq(0,1,0.05))[[20]]
above.cutoff <- no.calls %>%
  dplyr::filter(freq > cutoff)

Of 77808 markers, 54990 failed to genotype at least one sample, and 2750 markers failed to genotype at least 12.765% of samples.

In a similar fashion, we calculated the number of reference samples with missing genotypes. Repeated observations of samples/strains meant that genotype counts for each marker among them couldn’t be grouped and tallied, so determining no-call frequency occurred column-wise.

n.calls.strains <- apply(X = control_genotypes[,2:ncol(control_genotypes)], MARGIN = 2, function(x) table(x)[5])
n.calls.strains.df <- data.frame(n.calls.strains)
n.calls.strains.df$sample <- names(n.calls.strains)
n.calls.strains.df %<>%
  dplyr::rename(n.no.calls = n.calls.strains)

sampleQC <- ggplot(n.calls.strains.df, 
                   mapping = aes(x = reorder(sample,n.no.calls), 
                                 y = n.no.calls,
                                 text = paste("Sample:", sample))) + 
  geom_point() +
  QCtheme + 
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  labs(x = "Number of mice with missing genotypes",
       y = "Number of markers")
ggplotly(sampleQC, tooltip = c("text","y"))
high.n.samples <- n.calls.strains.df %>%
  dplyr::filter(n.no.calls > quantile(n.calls.strains.df$n.no.calls, probs = seq(0,1,0.05))[20])

We next validated the sexes of each sample using Chromosome X probe intensities. We paired up probe intensities, joined available metadata, and filtered down to only markers covering the X chromosome.

## Reading in genotype intensities
x_intensities <- data.table::fread("../data/MMControls/control.X.csv")
## Warning in data.table::fread("../data/MMControls/control.X.csv"): Detected 364
## column names but the data has 365 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
colnames(x_intensities)[1] <- "marker"
y_intensities <- data.table::fread("../data/MMControls/control.Y.csv")
## Warning in data.table::fread("../data/MMControls/control.Y.csv"): Detected 364
## column names but the data has 365 columns (i.e. invalid file). Added 1 extra
## default column name for the first column which is guessed to be row names or an
## index. Use setnames() afterwards if this guess is not correct, or fix the file
## write command that created the file to create a valid file.
colnames(y_intensities)[1] <- "marker"
## Check to see if dimensions of intensity tables are identical, marker orders identical, and sample orders identical
if((unique(dim(x_intensities) == dim(y_intensities)) && 
   unique(colnames(x_intensities) == colnames(y_intensities)) &&
   unique(x_intensities$marker == y_intensities$marker)) == TRUE){
     ## Pivoting the data longer
  x_int_long <- x_intensities %>%
  tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "x_int")
  y_int_long <- y_intensities %>%
  tidyr::pivot_longer(cols = -marker, names_to = "sample", values_to = "y_int")
  
  long_intensities <- cbind(x_int_long, y_int_long)
} else {
     print("Source intensity data frames have non-identical structure; exiting")
}

## Joining slimmer intensity files with marker metadata and reducing to Chromosome X markers
long_X_intensities <- long_intensities[,c(1,2,3,6)] %>%
  dplyr::left_join(., mm_metadata) %>%
  dplyr::filter(chr == "X")

Then we flagged markers with high missingness across all samples and samples with high missigness among all markers.

## Flagging markers and samples based on previous QC steps
flagged_X_intensities <- long_X_intensities %>%
  dplyr::mutate(marker_flag = dplyr::if_else(condition = marker %in% above.cutoff$marker,
                                             true = "FLAG",
                                             false = "")) %>%
  dplyr::mutate(high_missing_sample = dplyr::if_else(condition = sample %in% high.n.samples$sample,
                                                     true = "FLAG",
                                                     false = ""))

The first round of inferring predicted sexes used a rough search of the sample name for a proper convention among F1 hybrids, as well as an increasingly large regex search into the sample ID to identify properly named samples from inbred strains.

## First round of predicted sex inference
prelim.predicted.sexes <- flagged_X_intensities %>%
  dplyr::mutate(bg = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1") == TRUE ~ "F1",
                                      TRUE ~ as.character(sample)),
                predicted.sex = dplyr::case_when(stringr::str_detect(string = sample, 
                                                          pattern = "F1f") == TRUE ~ "f",
                                      stringr::str_detect(string = sample, 
                                                          pattern = "F1m") == TRUE ~ "m",
                                      TRUE ~ "unknown"))

unknown <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex == "unknown")

digit.trim <- unknown %>% 
  dplyr::mutate(mouse.id.1 = stringr::str_sub(sample, -1),
                predicted.sex.1 = dplyr::case_when(stringr::str_sub(mouse.id.1, 1, 1) %in% c("m","M") ~ "m",
                                                   stringr::str_sub(mouse.id.1, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.1 == "m" | predicted.sex.1 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.1, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.3= stringr::str_sub(sample, -3),
                predicted.sex.3 = dplyr::case_when(stringr::str_sub(mouse.id.3, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.3, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.3 == "m" | predicted.sex.3 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.3, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.4 = stringr::str_sub(sample, -4),
                predicted.sex.4 = dplyr::case_when(stringr::str_sub(mouse.id.4, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.4, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.4 == "m" | predicted.sex.4 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.4, 
                                                replacement = ""), 
                             false = bg),
                
                mouse.id.5 = stringr::str_sub(sample, -5),
                mouse.id.5 = stringr::str_replace(string = mouse.id.5,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.5 = dplyr::case_when(stringr::str_sub(mouse.id.5, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.5, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.5 == "m" | predicted.sex.5 == "f"),
                             true = str_replace(string = bg,
                                                pattern = mouse.id.5,
                                                replacement = ""),
                             false = bg),
                
                mouse.id.6 = stringr::str_sub(sample, -6),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:punct:]", 
                                                  replacement = ""),
                mouse.id.6 = stringr::str_replace(string = mouse.id.6,  ## a couple symbols in these ids mess up the regex search
                                                  pattern = "[:symbol:]", 
                                                  replacement = ""),
                predicted.sex.6 = dplyr::case_when(stringr::str_sub(mouse.id.6, 1, 1) %in% c("m","M") ~ "m",
                                                 stringr::str_sub(mouse.id.6, 1, 1) %in% c("f","F") ~ "f",
                                                 TRUE ~ "unknown"),
                bg = if_else(condition = (predicted.sex.6 == "m" | predicted.sex.6 == "f"), 
                             true = str_replace(string = bg, 
                                                pattern = mouse.id.6, 
                                                replacement = ""), 
                             false = bg)) %>%
  dplyr::mutate(predicted.sex = dplyr::case_when(predicted.sex.1 == "m" ~ "m", 
                                                 predicted.sex.3 == "m" ~ "m",
                                                 predicted.sex.4 == "m" ~ "m",
                                                 predicted.sex.5 == "m" ~ "m",
                                                 predicted.sex.6 == "m" ~ "m",
                                                 
                                                 predicted.sex.1 == "f" ~ "f", 
                                                 predicted.sex.3 == "f" ~ "f",
                                                 predicted.sex.4 == "f" ~ "f",
                                                 predicted.sex.5 == "f" ~ "f",
                                                 predicted.sex.6 == "f" ~ "f",
                                                 TRUE ~ "unknown"))

predicted.sexes.strings <- prelim.predicted.sexes %>%
  dplyr::filter(predicted.sex != "unknown") %>%
  dplyr::bind_rows(.,digit.trim)

## Taking the first marker as a sample
predicted.sex.table <- predicted.sexes.strings %>%
  dplyr::filter(marker %in% unique(prelim.predicted.sexes$marker)[1]) %>%
  dplyr::select(sample, predicted.sex, bg) %>%
  dplyr::group_by(predicted.sex) %>%
  dplyr::count() 

This captured 94 female samples, 249 male samples, leaving 21 samples of unknown predicted sex from nomenclature alone. These samples exhibit a range of naming inconsistencies.

predicted.sexes.strings %>%
  dplyr::filter(predicted.sex == "unknown",
                marker == predicted.sexes.strings$marker[[1]]) %>%
  dplyr::select(sample, bg)
##               sample                bg
## 1           34-HG-F1                F1
## 2           36-PG-F1                F1
## 3             BAG74u            BAG74u
## 4              BAG99             BAG99
## 5           CAST/EiJ          CAST/EiJ
## 6               IN13              IN13
## 7               IN25              IN25
## 8               IN34              IN34
## 9               IN40              IN40
## 10              IN47              IN47
## 11              IN54              IN54
## 12 KOMP cell DNA JM8 KOMP cell DNA JM8
## 13 KOMP cell DNA JM8 KOMP cell DNA JM8
## 14 KOMP cell DNA JM8 KOMP cell DNA JM8
## 15           MWN1026           MWN1026
## 16           MWN1030           MWN1030
## 17           MWN1194           MWN1194
## 18           MWN1198           MWN1198
## 19          MWN1214u          MWN1214u
## 20           MWN1279           MWN1279
## 21           MWN1287           MWN1287